# =============================================================================
# Safegraph Analysis
# This code is for replicating: Figure 2.
# 
# Create the datasets for Proximity Analysis
#   how far away from a plasma center do people live? Create the CDF of donor travel distance.
# =============================================================================

#Import core data
core = pd.read_parquet(cdd['p_d_sg_sample']+r'\core_cleaned_sample.parquet')
ptm = pd.read_parquet(cdd['p_d_sg_sample'] + r'\pattern_cleaned_sample.parquet')
core = core[core.placekey.isin(ptm.placekey.unique())]
# manual = pd.read_excel(cdd['p_d_sg_sample'] + r'\Crosswalk\manual.xlsx') #this file is not provided by the authors.


# #Select only plasma centers (the core dataset would have this variable already)
# core_p=core[core.plasma==1].copy()


# #Merge Pattern to Core
# cvars = ['placekey','placekeyp','sgid','id','name','brand','naics','state','sqft','parking','lat','long']
# ldiff(list(core),cvars)
# lint(list(ptm),cvars)

# ptm = pd.merge(ptm,core_p[cvars],how='inner',on=['placekey'])
# ptm.info(memory_usage='deep')
# ptm.to_parquet(cdd['p_d_sg_pva'] + r'\ptm_Plas_Slim_aug.parquet')



# #Go through the cbgv, cbgd, and tractv columns and add all cbgs, and tracts within 25 kilometers with a zero as an outcome.
# #Import and stitch together the 2016 Census Block Group geographies (used by Safegraph -- see their FAQ)
geo_year=2019
# state_codes = {
#     'WA': '53', 'DE': '10', 'DC': '11', 'WI': '55', 'WV': '54', 'HI': '15',
#     'FL': '12', 'WY': '56', 'PR': '72', 'NJ': '34', 'NM': '35', 'TX': '48',
#     'LA': '22', 'NC': '37', 'ND': '38', 'NE': '31', 'TN': '47', 'NY': '36',
#     'PA': '42', 'AK': '02', 'NV': '32', 'NH': '33', 'VA': '51', 'CO': '08',
#     'CA': '06', 'AL': '01', 'AR': '05', 'VT': '50', 'IL': '17', 'GA': '13',
#     'IN': '18', 'IA': '19', 'MA': '25', 'AZ': '04', 'ID': '16', 'CT': '09',
#     'ME': '23', 'MD': '24', 'OK': '40', 'OH': '39', 'UT': '49', 'MO': '29',
#     'MN': '27', 'MI': '26', 'RI': '44', 'KS': '20', 'MT': '30', 'MS': '28',
#     'SC': '45', 'KY': '21', 'OR': '41', 'SD': '46', "AS": "60", "MP": "69",
#     "VI": "78", "GU": "66"
# }

# #Import all of the shapefiles.(Note the shapefiles already have the correct CRS saved in them.)
# cbg = []
# for s in list(state_codes.values()):
#     temp=('tl_2010_'+s+'_bg00') if geo_year==2000 else ('tl_'+str(geo_year)+'_'+s+'_bg')
#     file=r'/BG_'+str(geo_year)+'/'+temp+'/'+temp+'.shp'
#     try:
#         cbg += [gpd.read_file(cdd['geo_t']+file)]
#     except Exception:
#         pass
# cbg = pd.concat(cbg,ignore_index=True,sort=False)
# # cbg.crs
# if geo_year==2000:
#     cbg=cbg.rename(columns={'BKGPIDFP00':'GEOID','ALAND00':'ALAND','AWATER00':'AWATER'}).drop(columns=['STATEFP00', 'COUNTYFP00', 'TRACTCE00', 'BLKGRPCE00', 'NAMELSAD00', 'MTFCC00', 'FUNCSTAT00', 'INTPTLAT00', 'INTPTLON00'])
# if geo_year!=2000:
#     cbg=cbg.drop(columns=['STATEFP', 'COUNTYFP', 'TRACTCE', 'BLKGRPCE', 'NAMELSAD', 'MTFCC', 'FUNCSTAT', 'INTPTLAT', 'INTPTLON'])

# #Generate a column with the cbg identifier (as numeric)
# cbg['cbg']=pd.to_numeric(cbg['GEOID'])
# cbg['fips']=np.floor(cbg['cbg']/10000000)

# #For each cbg find the centroid
# #       Apparently, the centroid is fine for small shapes but for large shapes it epsg4269 places extra weight on area closer to the poles
# #       For a solution with large geographies see https://gis.stackexchange.com/questions/372564/userwarning-when-trying-to-get-centroid-from-a-polygon-geopandas
# cbg['lat']=cbg.geometry.centroid.y
# cbg['long']=cbg.geometry.centroid.x
# cbg['centroid']=cbg.geometry.centroid
# #In the 6933 projection the area is in meters sq, same as aland and awater.
# cbg['area']=cbg['geometry'].to_crs('epsg:6933').area
# cbg.info(memory_usage='deep')
# cbg['tract'] = cbg['GEOID'].str[:11]
# cbg['state'] = cbg['GEOID'].str[:2]
# cbg.to_pickle(cdd['p_d_geo'] + r'\cbg_geo'+str(geo_year)+'.pkl')
# tract = cbg[['geometry','tract']].dissolve(by='tract')
# tract['centroid']=tract.geometry.centroid
# tract['lat']=tract.geometry.centroid.y
# tract['long']=tract.geometry.centroid.x
# tract.to_pickle(cdd['p_d_geo'] + r'\tract_geo'+str(geo_year)+'.pkl')

# =============================================================================
#For each location in ptm find all census tracts and block groups within 25km.
cbg = pd.read_parquet(cdd['p_d_geo'] + r'\cbg_geo'+str(geo_year)+'.parquet')
tract = pd.read_pickle(cdd['p_d_geo'] + r'\tract_geo'+str(geo_year)+'.pkl').reset_index()

cbgpt_df = pd.DataFrame(cbg[['GEOID','long','lat']])
tractpt_df = pd.DataFrame(tract[['tract','long','lat']])

def within(shape,skey,tgt,buffer,method='fast'):
    ''' 
    Passing a geopandas dataframe in shape takes forever because shape is not pickleable
    method='slow'  ----- only works with a geopandas dataframe for shape which is the bottleneck
    '''
    if method=='slow':
        tgtpt = gpd.GeoDataFrame(geometry=[Point(tgt['long'],tgt['lat'])], crs=4269).to_crs('epsg:6933')
        tgtpt['buffer'] = tgtpt['geometry'].buffer(buffer)
        tgtpt=tgtpt.set_geometry('buffer')
        res=gpd.sjoin(shape,tgtpt,op = 'within')
        return {**tgt,'results':list(res[skey].unique())}
    if method=='fast':
        #Each degree of lat or long is roughly 40 miles or 80 km so to get a very rough search grid around a point of radius r (in km) I need the degree difference to be less than r/100
        degdist = buffer/100000
        #Get a very preliminary list of candidates (filter by the degree distance)
        temp=shape[(abs(shape.lat-tgt['lat'])<degdist)&(abs(shape.long-tgt['long'])<degdist)].copy()
        if temp.shape[0]>0:
            #Calculate the actual distance using the vincenty distance formula (sort by distance)
            temp['distance']=temp.apply(lambda x: vinc.vincenty_inverse((tgt['lat'],tgt['long']),(x['lat'],x['long'])).m,axis=1)
            #temp['distance']=temp.apply(lambda x: distance.distance((tgt['lat'],tgt['long']),(x['lat'],x['long'])).m,axis=1)
            temp=temp[temp['distance']<buffer].sort_values(by='distance').reset_index(drop=True)
            return {**tgt,'results':list(temp[skey].unique())}
        if temp.shape[0]==0:
            return {**tgt,'results':[]}

#List of target points
from shapely.geometry import Point
targets = ptm[['placekey','long','lat']].dropna().drop_duplicates('placekey').to_dict(orient='records')


cbgs_missed = Parallel(n_jobs=20)(delayed(within)(shape=cbgpt_df, skey='GEOID', tgt=tgt, buffer=25000) for tgt in tqdm(targets))
cbgs_missed = pd.DataFrame(cbgs_missed)
tracts_missed = Parallel(n_jobs=20)(delayed(within)(shape=tractpt_df, skey='tract', tgt=tgt, buffer=25000) for tgt in tqdm(targets))
tracts_missed = pd.DataFrame(tracts_missed)


ptm = pd.merge(ptm,cbgs_missed.rename(columns={'results':'cbg_near'})[['placekey','cbg_near']],how='left',on='placekey')
ptm = pd.merge(ptm,tracts_missed.rename(columns={'results':'tract_near'})[['placekey','tract_near']],how='left',on='placekey')

ptm['cbgv'] = ptm.apply(lambda x: re.sub('{','{'+':0,'.join(['"'+v+'"' for v in ldiff(x['cbg_near'],eval(x['cbgv']).keys())])+':0' + (',' if len(x['cbgv'])>5 else ''),x['cbgv']) if len(ldiff(x['cbg_near'],eval(x['cbgv']).keys()))>0 else x['cbgv']   ,axis=1)
ptm['cbgd'] = ptm.apply(lambda x: re.sub('{','{'+':0,'.join(['"'+v+'"' for v in ldiff(x['cbg_near'],eval(x['cbgd']).keys())])+':0' + (',' if len(x['cbgd'])>5 else ''),x['cbgd']) if len(ldiff(x['cbg_near'],eval(x['cbgd']).keys()))>0 else x['cbgd']   ,axis=1)
ptm['tractv'] = ptm.apply(lambda x: re.sub('{','{'+':0,'.join(['"'+v+'"' for v in ldiff(x['tract_near'],eval(x['tractv']).keys())]) + ':0' + (',' if len(x['tractv'])>5 else ''),x['tractv']) if len(ldiff(x['tract_near'],eval(x['tractv']).keys()))>0 else x['tractv'] ,axis=1)
ptm.drop(columns=['cbg_near','tract_near'],inplace=True)

ptm.to_parquet(cdd['p_d_sg_pva'] + r'\ptm_Plas_Slim_aug2.parquet')
ptm = pd.read_parquet(cdd['p_d_sg_pva'] + r'\ptm_Plas_Slim_aug2.parquet')



# =============================================================================
#Create the files for proximity analysis
keys=['placekey', 'placekeyp', 'sgid', 'zip','cbg', 'state', 'date_e', 'id', 'name', 'brand','naics', 'sqft', 'parking', 'lat', 'long']

def my_explode(df,keys,target,val='values',method='flat',*args,**kwargs):
    '''    Code to explode a column of text which contains flat dictionaries (eg. {k1:v1,...,kn:vn}) or a json string.'''
    temp = df.set_index(keys)[target].map(eval).apply(lambda x: [{target:k,val:x[k]} for k in x]).explode().dropna()
    if method=='flat':
        temp = pd.concat([temp.reset_index().drop(columns=[target]),pd.DataFrame(temp.values.tolist())],sort=False,axis=1)
    if method=='json':
        temp = pd.concat([temp.reset_index().drop(columns=[target]),pd.json_normalize(temp,*args,**kwargs)],sort=False,axis=1)
    return temp

prox_cbgv = my_explode(df=ptm,keys=keys,target='cbgv',val='visitors',method='flat')
prox_cbgd = my_explode(df=ptm,keys=keys,target='cbgd',val='visitors',method='flat')
prox_tractv = my_explode(df=ptm,keys=keys,target='tractv',val='visitors',method='flat')


# =============================================================================
# #Merge in the core information and population for each cbg

#Import some census data for normalizing (https://colab.research.google.com/drive/1lTi8JXfX9rh2mnuFjgYgKsIcMcDI3EG_?usp=sharing)
#   I will use the number of devices seen in the state by the state population to normalize.
cbg_pop = pd.read_parquet(cdd['sg_d_acs']+r'\cbg_pop.parquet')
hp = pd.read_parquet(cdd['sg_d_pm']+r'\hp.parquet')
norm = pd.read_parquet(cdd['sg_d_pm']+r'\norm.parquet').rename(columns={'region':'state'})
norm = norm.groupby(['year','month','state'])[['total_visits', 'total_devices_seen', 'total_home_visits', 'total_home_visitors']].mean().reset_index()
pop_st = cbg_pop.groupby('state')['population'].sum().reset_index()

#Convert the fips code to regions (for merging with safegraph)
state_codes = {'WA': '53', 'DE': '10', 'DC': '11', 'WI': '55', 'WV': '54', 'HI': '15',
                'FL': '12', 'WY': '56', 'PR': '72', 'NJ': '34', 'NM': '35', 'TX': '48',
                'LA': '22', 'NC': '37', 'ND': '38', 'NE': '31', 'TN': '47', 'NY': '36',
                'PA': '42', 'AK': '02', 'NV': '32', 'NH': '33', 'VA': '51', 'CO': '08',
                'CA': '06', 'AL': '01', 'AR': '05', 'VT': '50', 'IL': '17', 'GA': '13',
                'IN': '18', 'IA': '19', 'MA': '25', 'AZ': '04', 'ID': '16', 'CT': '09',
                'ME': '23', 'MD': '24', 'OK': '40', 'OH': '39', 'UT': '49', 'MO': '29',
                'MN': '27', 'MI': '26', 'RI': '44', 'KS': '20', 'MT': '30', 'MS': '28',
                'SC': '45', 'KY': '21', 'OR': '41', 'SD': '46', "AS": "60", "MP": "69",
                "VI": "78", "GU": "66"}
state_codes2 = {int(state_codes[s]):s.lower() for s in state_codes.keys()}
pop_st['state'] = pop_st['state'].map(state_codes2)
norm = pd.merge(norm,pop_st,how='left',on='state')
norm['nfactor'] = (norm['population'] / norm['total_devices_seen']).clip(upper=50)
norm['date'] = pd.to_datetime(norm['year'].astype(str)+norm['month'].astype(str).apply(lambda x: x.zfill(2))+'01',format='%Y%m%d')
norm['date'] = norm['date'].apply(lambda x: (x+relativedelta(months=+1))+relativedelta(days=-1))
norm['state'] = norm['state'].map({s.lower():state_codes[s] for s in state_codes.keys()}).fillna(norm['state'])


#Merge in a factor to inflate the visitors by the number of devices surveyed relative to state population.
def tempfunc(df,norm):
    df['state'] = df['state'].astype(str).map(state_codes)
    df.rename(columns={'date_e':'date'},inplace=True)
    df = pd.merge(df,norm[['state','date','nfactor','population','total_devices_seen']].rename(columns={'population':'pop_st','total_devices_seen':'dev_st'}),how='left',on=['state','date'])
    df['visitorsn'] = df['visitors'] * df['nfactor']
    return df
prox_cbgv = tempfunc(df=prox_cbgv,norm=norm)
prox_cbgd = tempfunc(df=prox_cbgd,norm=norm)
prox_tractv = tempfunc(df=prox_tractv,norm=norm)


#Merge in the visitor geography population
prox_cbgv = pd.merge(prox_cbgv,cbg_pop[['cbg','population']].rename(columns={'cbg':'cbgv','population':'pop_cbgv'}),how='left',on=['cbgv'])
prox_cbgd = pd.merge(prox_cbgd,cbg_pop[['cbg','population']].rename(columns={'cbg':'cbgd','population':'pop_cbgd'}),how='left',on=['cbgd'])
prox_tractv = pd.merge(prox_tractv,cbg_pop.groupby('tract')['population'].sum().reset_index().rename(columns={'tract':'tractv','population':'pop_tractv'}),how='left',on=['tractv'])


#Calculate the fraction of residents that visit the establishment.
prox_cbgv['visitorsn_p'] = prox_cbgv['visitorsn'] / prox_cbgv['pop_cbgv']
prox_cbgd['visitorsn_p'] = prox_cbgd['visitorsn'] / prox_cbgd['pop_cbgd']
prox_tractv['visitorsn_p'] = prox_tractv['visitorsn'] / prox_tractv['pop_tractv']
prox_cbgv['visitorsn_lp'] = 4 * prox_cbgv['nfactor'] / prox_cbgv['pop_cbgv']
prox_cbgd['visitorsn_lp'] = 4 * prox_cbgd['nfactor'] / prox_cbgd['pop_cbgd']
prox_tractv['visitorsn_lp'] = 4 * prox_tractv['nfactor'] / prox_tractv['pop_tractv']

#Merge in the lat and long of the cbgv, cbgd, and tractv to calculate distance.
prox_cbgv = pd.merge(prox_cbgv,cbg[['GEOID','lat','long']].rename(columns={'GEOID':'cbgv','lat':'cbgv_lat','long':'cbgv_long'}),how='left',on='cbgv')
prox_cbgd = pd.merge(prox_cbgd,cbg[['GEOID','lat','long']].rename(columns={'GEOID':'cbgd','lat':'cbgd_lat','long':'cbgd_long'}),how='left',on='cbgd')
prox_tractv = pd.merge(prox_tractv,tract[['tract','lat','long']].rename(columns={'tract':'tractv','lat':'tractv_lat','long':'tractv_long'}),how='left',on='tractv')


def tempfunc(df,base,tgt):
    df['est2orig'] = df[base].map(str) + '_' + df[tgt].map(str)
    temp = df.drop_duplicates([base,tgt])
    temp['distance_e2o'] = temp.apply(lambda x: vinc.vincenty_inverse((x[tgt+'_lat'],x[tgt+'_long']),(x['lat'],x['long'])).m ,axis=1)
    temp = dict(zip(temp['est2orig'].tolist(),temp['distance_e2o'].tolist()))
    df['distance_e2o'] = df['est2orig'].map(temp)
    df.drop(columns=['est2orig'],inplace=True)
    return df

prox_cbgv['tractv'] = prox_cbgv['cbgv'].str[:11]
prox_cbgd['tractd'] = prox_cbgd['cbgd'].str[:11]
prox_cbgv['tract'] = pd.to_numeric((np.floor(prox_cbgv['cbg']/10)),downcast='integer',errors='coerce').astype(str).str.zfill(11).str[:11]
prox_cbgd['tract'] = pd.to_numeric((np.floor(prox_cbgd['cbg']/10)),downcast='integer',errors='coerce').astype(str).str.zfill(11).str[:11]
prox_tractv['tract'] = pd.to_numeric((np.floor(prox_tractv['cbg']/10)),downcast='integer',errors='coerce').astype(str).str.zfill(11).str[:11]


prox_cbgv = tempfunc(df=prox_cbgv,base='cbg',tgt='cbgv')
prox_cbgd = tempfunc(df=prox_cbgd,base='cbg',tgt='cbgd')
prox_tractv = tempfunc(df=prox_tractv,base='tract',tgt='tractv')


#Merge in controls measured as of 2019 for cross-sectional analysis.
controls_aug = pd.read_pickle(cdd['acs_p'] + r'\tract_controls_aug_10.pkl')
cvars = ['tract','white_p', 'male_p', 'educ_sc_p', 'educ_bach_p', 'pv100_p', 'pv200_p', 'aid_p', 'snap_p', 'pubai_p', 'inc_hhmean', 'ownocc_p', 'comm_car_p', 'comm_public_p']
controls_aug  = controls_aug[~pd.isnull(controls_aug['tract'])]
controls_aug['tract'] = pd.to_numeric(controls_aug['tract'],downcast='integer').astype(str).str.zfill(11)
prox_cbgv = pd.merge(prox_cbgv,controls_aug[controls_aug.year==2017][cvars].rename(columns={'tract':'tractv'}),how='left',on='tractv')
prox_cbgd = pd.merge(prox_cbgd,controls_aug[controls_aug.year==2017][cvars].rename(columns={'tract':'tractd'}),how='left',on='tractd')
prox_tractv = pd.merge(prox_tractv,controls_aug[controls_aug.year==2017][cvars].rename(columns={'tract':'tractv'}),how='left',on='tractv')


#Merge in visits and visitors
prox_cbgv = pd.merge(prox_cbgv,ptm[['placekey','date_e','visits','visitors']].rename(columns={'date_e':'date','visits':'visits_t','visitors':'visitors_t'}),how='left',on=['placekey','date'])
prox_cbgd = pd.merge(prox_cbgd,ptm[['placekey','date_e','visits','visitors']].rename(columns={'date_e':'date','visits':'visits_t','visitors':'visitors_t'}),how='left',on=['placekey','date'])
prox_tractv = pd.merge(prox_tractv,ptm[['placekey','date_e','visits','visitors']].rename(columns={'date_e':'date','visits':'visits_t','visitors':'visitors_t'}),how='left',on=['placekey','date'])


#Merge in the normalization of the total number of surveyed devices from each cbg.
from pandas.tseries.offsets import MonthEnd
hp = pd.read_parquet(cdd['sg_d_pm']+r'\hp.parquet').rename(columns={'census_block_group':'cbg','number_devices_residing':'dev_home','number_devices_primary_daytime':'dev_day'})
hp['day']=1
hp['date'] = pd.to_datetime(hp[['year','month','day']]) + MonthEnd(1)
hp['cbg'] = hp['cbg'].astype(str).str.zfill(12)
hp['tract'] = hp['cbg'].str[:11]

prox_cbgv = pd.merge(prox_cbgv,hp[['cbg','date','dev_home']].rename(columns={'cbg':'cbgv'}),how='left',on=['cbgv','date'])
prox_cbgd = pd.merge(prox_cbgd,hp[['cbg','date','dev_day']].rename(columns={'cbg':'cbgd'}),how='left',on=['cbgd','date'])
prox_tractv = pd.merge(prox_tractv,hp.groupby(['tract','date'])['dev_home'].sum().reset_index().rename(columns={'tract':'tractv'}),how='left',on=['tractv','date'])

prox_cbgv['visitorsd_p'] = (prox_cbgv['visitors'] / prox_cbgv['dev_home']).clip(upper=1)
prox_cbgd['visitorsd_p'] = (prox_cbgv['visitors'] / prox_cbgv['dev_home']).clip(upper=1)
prox_tractv['visitorsd_p'] = (prox_tractv['visitors'] / prox_tractv['dev_home']).clip(upper=1)

#Mark rows for visitors to a plasma center that is not closest to the residence / work
prox_cbgv['closest'] = 1*(prox_cbgv['distance_e2o'] == prox_cbgv.groupby(['cbgv','date'])['distance_e2o'].transform('min'))
prox_cbgd['closest'] = 1*(prox_cbgd['distance_e2o'] == prox_cbgd.groupby(['cbgd','date'])['distance_e2o'].transform('min'))
prox_tractv['closest'] = 1*(prox_tractv['distance_e2o'] == prox_tractv.groupby(['tractv','date'])['distance_e2o'].transform('min'))


#Save out the data
prox_cbgv.to_parquet(cdd['p_d_sg_pva'] + r'\ProximityAnalysis\prox_cbgv.parquet')
prox_cbgd.to_parquet(cdd['p_d_sg_pva'] + r'\ProximityAnalysis\prox_cbgd.parquet')
prox_tractv.to_parquet(cdd['p_d_sg_pva'] + r'\ProximityAnalysis\prox_tractv.parquet')

# =============================================================================
#Import the data
prox_cbgv = pd.read_parquet(cdd['p_d_sg'] + r'\ProximityAnalysis\prox_cbgv.parquet')
prox_cbgd = pd.read_parquet(cdd['p_d_sg'] + r'\ProximityAnalysis\prox_cbgd.parquet')
prox_tractv = pd.read_parquet(cdd['p_d_sg'] + r'\ProximityAnalysis\prox_tractv.parquet')

def weighted_percentile(data, weights, perc):
    """
    perc : percentile in [0-1]!
    """
    ix = np.argsort(data)
    data = data[ix] # sort data
    weights = weights[ix] # sort weights
    cdf = (np.cumsum(weights) - 0.5 * weights) / np.sum(weights) # 'like' a CDF function
    return np.interp(perc, cdf, data)

prox_tractv[(prox_tractv['visitors']>0)&(prox_tractv['distance_e2o']<100000)]['distance_e2o'].hist(bins=100,density=True,cumulative=True)
prox_cbgv[(prox_cbgv['visitors']>0)&(prox_cbgv['distance_e2o']<100000)]['distance_e2o'].hist(bins=100,density=True,cumulative=True)


a=prox_tractv.copy()
a=a[(a['distance_e2o']<25000)&(a['visitors_t']>=10)]
a=a.sort_values(by=['id','date','distance_e2o'])
a['visitors_l'] = a['visitors'].replace(4,2)
a['frac'] = a.groupby(['placekey','date'])['visitors'].cumsum() / a.groupby(['placekey','date'])['visitors'].transform('sum')
a['dist_bins'] = np.where(a['distance_e2o']<50000, np.ceil(a['distance_e2o']/100), np.nan)
t = bal_panel({'placekey':a['placekey'].unique(),'date':a['date'].unique(),'dist_bins':a['dist_bins'].unique()})
t = pd.merge(t,a.groupby(['placekey','date','dist_bins'])['frac'].max().reset_index(),how='left',on=['placekey','date','dist_bins'])
t['t'] = t['frac']
t = fill_eff(df=t,vfill=['frac'],gp=['placekey','date'],sort={'svars':['placekey','date','dist_bins'],'sort1':[1, 1, 1],'sort2':[1, 1, 0]},limit=1000,par=True,par_ng=20)
b=t.groupby('dist_bins')['frac'].describe(percentiles=[0.1,.15,0.2,0.25,0.5,0.75,0.9]).reset_index()
b.loc[len(b)] = 0
b.sort_values(by='dist_bins',inplace=True)
b.rename(columns= {'10%':'p10', '15%':'p15', '20%':'p20', '25%':'p25', '50%':'p50', '75%':'p75', '90%':'p90'}, inplace=True)
b.to_parquet(cdd['p_d_sg_pva'] + r'\ProximityAnalysis\prox_cbgv_cdf2.parquet')

